# Results ----
# Random Forest SSL ----
# Load required libraries
library(randomForest)
library(caret)
library(caTools)
library(ggplot2)

# Set working directory (modify as needed)
setwd("INSERT_PATH_HERE")

# ----------------------------------------
# Data Preparation
# ----------------------------------------
# Load training dataset (expects an 'ecotype' column plus predictor columns)
data <- read.csv("TrainingData.csv", sep = ",", header = TRUE)
# Random forest classification requires the response to be a factor
data <- transform(data, ecotype = as.factor(ecotype))
train <- data

# ----------------------------------------
# Train the RF model
# ----------------------------------------
# Replace x with the best mtry and xxxxx with the best ntree for your data.
# NOTE: the original call also passed tuneLength, metric and trControl = ctrl;
# those are caret::train() arguments (and 'ctrl' was never defined), not
# randomForest() arguments, so they have been removed. Run the tuning with
# caret::train() separately and plug the winning mtry/ntree in here.
Model_Tunned <- randomForest(ecotype ~ ., data = train,
                             mtry = x, ntree = xxxxx,
                             importance = TRUE,
                             strata = train$ecotype)
# Display variable importance (mean decrease in accuracy / Gini)
importance(Model_Tunned)
varImpPlot(Model_Tunned)

# ----------------------------------------
# Test the RF Model
# ----------------------------------------
# Load test dataset (must contain the same predictors plus 'ecotype')
dataTest <- read.csv("TestingData.csv", sep = ",", header = TRUE)
# Make predictions on the held-out data
pred <- predict(Model_Tunned, newdata = dataTest)
# Overall accuracy: proportion of correctly classified test samples
sum(pred == dataTest$ecotype) / nrow(dataTest)
# Create confusion matrix (reference classes in rows, predictions in columns).
# Referencing the column by name avoids the fragile numeric index the
# original used (which also reused the name 'x' from mtry above).
cm <- table(dataTest$ecotype, pred)
conf_matrix_df <- confusionMatrix(cm)
conf_matrix_df
# HCA SSL ----
# Load required libraries
library(factoextra)
library(cluster)
library(ggdendro)
library(ggplot2)
library(ape)
library(dendextend)

# ----------------------------------------
# Data Preparation
# ----------------------------------------
# Load dataset (ensure the CSV file is in the working directory)
data <- read.csv("PCoordBentSym.csv", sep = ",", header = TRUE)
# Convert your categorical variable to a factor
data <- transform(data, YourGroup = as.factor(YourGroup))

# ----------------------------------------
# Data Scaling
# ----------------------------------------
# Standardize the numeric columns only: scale() errors on non-numeric
# columns (the YourGroup factor created above, and any ID column).
data.scaled <- scale(data[sapply(data, is.numeric)])
# Compute Euclidean distance matrix on the standardized data
d <- dist(data.scaled, method = "euclidean")

# ----------------------------------------
# Optimal Cluster Number Selection (Gap Statistic)
# ----------------------------------------
# Fixed undefined object: the scaled matrix is 'data.scaled' ('scaled_data'
# was never created). B bootstrap replicates; K.max candidate cluster counts.
gap_stat <- clusGap(data.scaled, FUN = hcut, nstart = 25, K.max = 20, B = 200)
# Visualize the gap statistic
fviz_gap_stat(gap_stat)

# ----------------------------------------
# Hierarchical Clustering
# ----------------------------------------
# Compute hierarchical clustering using Ward's method
final_clust <- hclust(d, method = "ward.D2")
# Cut tree into clusters (replace x with the optimal number found above)
groups <- cutree(final_clust, k = x)

# ----------------------------------------
# Save Cluster Results
# ----------------------------------------
# Combine LabID, assigned cluster groups, and the original grouping.
# A data.frame preserves each column's type; the original cbind() of
# c(data$YourGroup) coerced the factor to its integer codes and the
# whole matrix to a single type. 'CLUSTER' was also undefined — the
# cluster assignments live in 'groups' from cutree() above.
results1 <- data.frame(LabID = data$LabID,
                       CLUSTER = groups,
                       YourGroup = data$YourGroup)
# Export results to CSV (write.csv uses comma separators, matching the
# .csv extension; write.table's default separator is a space)
write.csv(results1, file = "PredictedClusters.csv", row.names = FALSE)

# ----------------------------------------
# Dendrogram Visualization
# ----------------------------------------
# Label the dendrogram leaves with each sample's group
final_clust$labels <- data$YourGroup
# Create dendrogram plot (replace x with the chosen number of clusters k)
dend_plot <- fviz_dend(final_clust, rect = TRUE, cex = 0.5, k = x,
                       main = "YourName",
                       xlab = "YourGroup", ylab = "Distance", sub = "",
                       ggtheme = theme_minimal(), k_colors = "simpsons",
                       color_labels_by_k = FALSE, type = "rectangle")
# Display the dendrogram
dend_plot
# Save the dendrogram as an image
ggsave("dendrogram.png", plot = dend_plot)
# (Figure: dendrogram image appears here in the original document)
# 2B-PLS ----
# Set working directory (modify as needed)
setwd("INSERT_PATH_HERE")
# Load required libraries
library(readxl)
library(geomorph)

# ----------------------------------------
# Load Data
# ----------------------------------------
# Load the symmetry-adjusted shape data (binary R object saved earlier)
load("YourName_sym.dt.bin")
Shape <- YourName_sym.dt$symm.sh
# Load environmental and polygon data
Polygon <- read_excel("path/to/polygon.xlsx")
Env_Variable <- read_excel("path/to/Environmental_Var.xlsx")

# ----------------------------------------
# Data Preparation
# ----------------------------------------
# Block 1: shape coordinates; Block 2: environmental variables as numeric
X1 <- Shape
Y1 <- as.data.frame(lapply(Env_Variable, as.numeric))

# ----------------------------------------
# Data Scaling
# ----------------------------------------
# Standardize the environmental block so variables are comparable
scaled_Y1 <- scale(Y1)

# ----------------------------------------
# PLS Analysis
# ----------------------------------------
# Two-block partial least squares with 999 permutations
PLS1 <- two.b.pls(X1, scaled_Y1, iter = 999, seed = NULL, print.progress = TRUE)
# Display summary statistics (r-PLS correlation, effect size, p-value)
summary(PLS1)
# NOTE: the original script called cumsum(explvar(PLS1)); explvar() belongs
# to the 'pls' package (not loaded here) and does not apply to a geomorph
# two.b.pls object, so that call was removed. The singular values of the
# cross-covariance matrix are available in PLS1$svd$d if proportions of
# covariation are needed.

# ----------------------------------------
# PLS Scatter Plot
# ----------------------------------------
# Define color palette (one colour per group level)
cols <- c("YourColour1", "YourColour2", "YourColour3", "etc")
# NOTE(review): 'YourGroup' is not created anywhere in this section — it
# must be a data frame with a grouping column of the same name, loaded
# beforehand (e.g. from the Polygon spreadsheet). Confirm before running.
P <- plot(PLS1, col = cols[as.factor(YourGroup$YourGroup)], pch = 19, cex = 1.5)
# Add legend matching the group colours
legend("topleft", legend = unique(as.factor(YourGroup$YourGroup)),
       col = unique(cols[as.factor(YourGroup$YourGroup)]),
       pt.cex = 2, cex = 1, pch = 19)

# ----------------------------------------
# PLS Loadings Barplot
# ----------------------------------------
# Extract the first-axis loadings for the environmental block and sort
# them by absolute magnitude (largest contribution first)
loadings_X <- PLS1$right.pls.vectors[, 1]
sorted_indices <- order(abs(loadings_X), decreasing = TRUE)
sorted_loadings <- loadings_X[sorted_indices]
# Environmental variable names, in the same sorted order
sorted_env_variables <- colnames(Env_Variable)[sorted_indices]
# Generate horizontal barplot of the sorted loadings
barplot(sorted_loadings, beside = TRUE,
        main = "YourName",
        xlab = "Shape", ylab = "Environment",
        names.arg = sorted_env_variables,
        col = "magenta", border = "white", space = 0.2, horiz = TRUE, las = 2)
# (Figure: PLS plot image appears here in the original document)
# RDA ----
# Set working directory (modify as needed)
setwd("INSERT_PATH_HERE")
# Load required libraries
library(vegan)
library(geomorph)
library(readxl)
library(ggvegan)
library(ggplot2)
library(vegan3d)

# ----------------------------------------
# Load Data
# ----------------------------------------
# Load symmetry-adjusted shape data (binary R object saved earlier)
load("YourName_sym.dt.bin")
Shape <- YourName_sym.dt$symm.sh
# Convert the 3D landmark array to a 2D (specimens x coordinates) matrix
ShapesData <- two.d.array(Shape, sep = ".")
# Load environmental and regional data
Polygon <- read_excel("path/to/polygon.xlsx")
Env_Variable <- read_excel("path/to/Environmental_Var.xlsx")

# ----------------------------------------
# Data Preparation
# ----------------------------------------
# NOTE(review): the original line here was 'Ecotype <- Ecotype', a no-op
# self-assignment of an undefined object that errors at run time; it was
# removed. 'Ecotype' must be loaded beforehand as a data frame with an
# 'Ecotype' column (it is used for plot colours below) — confirm its source.
# Convert environmental variables to numeric (replace 1:17 with your columns)
Env_variable_Numeric <- sapply(Env_Variable[1:17], as.numeric)

# ----------------------------------------
# Data Scaling
# ----------------------------------------
Scaled_Env_Variable <- scale(Env_variable_Numeric)
# rda()'s 'data' argument needs a data frame, not a matrix
if (!is.data.frame(Scaled_Env_Variable)) {
  Scaled_Env_Variable <- as.data.frame(Scaled_Env_Variable)
}

# ----------------------------------------
# RDA Analysis
# ----------------------------------------
# Redundancy analysis of shape on all scaled environmental predictors
rda_result <- rda(ShapesData ~ ., data = Scaled_Env_Variable)
# Display summary statistics
summary(rda_result)

# ----------------------------------------
# Significance Testing (permutation ANOVA)
# ----------------------------------------
Anova.RDA.Overall <- anova.cca(rda_result)                 # Overall significance
Anova.RDA.terms <- anova.cca(rda_result, by = "terms")     # Sequential term significance
Anova.RDA.margin <- anova.cca(rda_result, by = "margin")   # Marginal term significance
Anova.RDA.onedf <- anova.cca(rda_result, by = "onedf")     # One-df contrasts
Anova.RDA.axis <- anova.cca(rda_result, by = "axis")       # Axis significance

# ----------------------------------------
# Variance Explained by Each Environmental Variable
# ----------------------------------------
# Define variances (replace the values with your own)
variances <- c(
  SalinityMean = 0.00005464, SalinityRange = 0.00006634, SilicateMean = 0.00008201,
  TemperatureMean = 0.00005589, Aspect = 0.00002771, MLDepthMean = 0.00002539,
  NitrateMean = 0.00000366, PhMean = 0.00001173, Slope = 0.00000604,
  ChlorophyllMean = 0.00002295, DissolvedO2Mean = 0.0000164, DissolvedO2Range = 0.00000993,
  CurrentDirectionMean = 0.00001241, CurrentDirectionRange = 0.00004396,
  CurrentVelocityMean = 0.00000776, BathymetryMean = 0.00001218, TopographicPosition = 0.00000297)
# Calculate total variance
total_variance <- sum(variances)
# Percentage of the explained variance attributable to each variable
percent_variance <- (variances / total_variance) * 100
# Display the results
percent_variance

# ----------------------------------------
# RDA Biplots
# ----------------------------------------
# Standard 2D biplot
autoplot(rda_result, arrows = TRUE, geom = "text", legend = "none")
# 3D biplot using vegan3d (one colour per ecotype level)
cols <- c("YourColour1", "YourColour2", "YourColour3", "etc")
ordirgl(rda_result, display = "site", cex = 0.5, choices = 1:3,
        ax.col = "red", arr.len = 0.05, arr.col = "blue", pch = 16,
        col = cols[as.factor(Ecotype$Ecotype)])
# Alternative static 3D plot
ordiplot3d(rda_result, display = "site", choices = 1:3, ax.col = "black",
           arr.len = 0.1, arr.col = "green",
           col = cols[as.factor(Ecotype$Ecotype)], pch = 20)

# ----------------------------------------
# Statistical Tests on Covariates
# ----------------------------------------
# anova.cca() returns an anova-class data frame; the p-values are in its
# "Pr(>F)" column. The original '$CCA$pvals' accesses returned NULL.
# Per-axis p-values (one row per constrained axis, last row = residual NA):
axis_pvalues <- anova.cca(rda_result, by = "axis")[["Pr(>F)"]]
# Per-variable (sequential term) p-values:
variable_pvalues <- anova.cca(rda_result, by = "terms")[["Pr(>F)"]]

# ----------------------------------------
# Correlation Between Environmental Variables
# ----------------------------------------
# Use the numeric matrix built above: cor() errors on non-numeric columns
# that may be present in the raw Env_Variable spreadsheet. Correlation is
# scale-invariant, so the unscaled numeric matrix gives the same result.
Correlation <- cor(Env_variable_Numeric)
# Save correlation matrix
write.table(Correlation, file = "Env_Variable_Correlation.tsv", sep = "\t")
# Identify variable pairs with high correlation (>= 0.6, excluding the diagonal)
high_correlation_variables <- which(Correlation >= 0.6 & Correlation < 1, arr.ind = TRUE)
# Display results
print(high_correlation_variables)
# (Figure: RDA biplot image appears here in the original document)